/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include <cutil_inline.h>    // includes cuda.h and cuda_runtime_api.h
#include <math.h>

#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/freeglut.h>
#endif

#include <cuda_gl_interop.h>
#include "cutil_math.h"

__constant__ float softeningSquared;
__constant__ double softeningSquared_fp64;



#include "thrust/device_vector.h"
#include "thrust/device_ptr.h"
#include "thrust/reduce.h"

// Per-body forces exerted by the "special" body, accumulated on the device
// each step and reduced with Thrust in integrateNbodySystem.
thrust::device_vector<float4> thrust_force4;
float4* force_array; // raw device pointer into thrust_force4, passed to the kernel

// Must be called once before the first integration step.
void allocateForceVector(int count){
  thrust_force4.resize(count); // resize zero-initializes the elements
  force_array = thrust::raw_pointer_cast(&thrust_force4[0]);
}

template<class T>
struct SharedMemory
{
  __device__ inline operator       T*()
  {
    extern __shared__ int __smem[];
    return (T*)__smem;
  }

  __device__ inline operator const T*() const
  {
    extern __shared__ int __smem[];
    return (T*)__smem;
  }
};
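
// The cast-through-__smem idiom above exists because a templated kernel
// cannot declare "extern __shared__ T smem[]" directly: each instantiation
// would redeclare the same symbol with a different type.  A minimal sketch
// of the idiom in use (illustrative only; nothing in this file calls it):
template <typename T>
__global__ void reverseBlockSketch(T* out, const T* in)
{
  T* smem = SharedMemory<T>();   // typed view of the dynamic shared memory
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
  smem[threadIdx.x] = in[i];
  __syncthreads();
  out[i] = smem[blockDim.x - 1 - threadIdx.x];
}
// Launched as, e.g.:
//   reverseBlockSketch<<<grid, block, block.x * sizeof(float)>>>(dOut, dIn);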


#define max_boundary 50.f

template <typename T> struct vec3         { typedef float   Type; }; // dummy 
template <>           struct vec3<float>  { typedef float3  Type; };
template <>           struct vec3<double> { typedef double3 Type; };

template <typename T> struct vec4         { typedef float   Type; }; // dummy 
template <>           struct vec4<float>  { typedef float4  Type; };
template <>           struct vec4<double> { typedef double4 Type; };

template<typename T>
__device__ T rsqrt_T(T x) { return rsqrt(x); }

template<>
__device__ float rsqrt_T<float>(float x) { return rsqrtf(x); }


// Macros to simplify shared memory addressing
#define SX(i) sharedPos[i+blockDim.x*threadIdx.y]
// This macro is only used when multithreadBodies is true (below)
#define SX_SUM(i,j) sharedPos[i+blockDim.x*j]

template <typename T>
__device__ T getSofteningSquared() { return softeningSquared; }
template <>
__device__ double getSofteningSquared<double>()   { return softeningSquared_fp64; }
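
// The softening constants live in __constant__ memory, so they are set from
// the host with cudaMemcpyToSymbol.  Sketch of a host-side setter (the real
// one lives outside this section; the name here is illustrative):
//
//   void setSofteningSquared(float eps2)
//   {
//     cutilSafeCall(cudaMemcpyToSymbol(softeningSquared, &eps2, sizeof(float)));
//     double eps2d = (double)eps2;
//     cutilSafeCall(cudaMemcpyToSymbol(softeningSquared_fp64, &eps2d, sizeof(double)));
//   }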

template <typename T>
struct DeviceData {
  T* dPos[2]; // mapped host pointers
  T* dVel;
  cudaEvent_t  event;
  unsigned int offset;
  unsigned int numBodies;
};
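
// One possible non-PBO, single-GPU setup (illustrative sketch; the actual
// allocation, including the zero-copy mapped-host-pointer path the dPos
// comment refers to, lives in the host-side body system code):
//
//   DeviceData<float> d;
//   cutilSafeCall(cudaMalloc((void**)&d.dPos[0], numBodies * sizeof(float4)));
//   cutilSafeCall(cudaMalloc((void**)&d.dPos[1], numBodies * sizeof(float4)));
//   cutilSafeCall(cudaMalloc((void**)&d.dVel,    numBodies * sizeof(float4)));
//   cutilSafeCall(cudaEventCreate(&d.event));
//   d.offset = 0;  d.numBodies = numBodies;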


template <typename T>
__device__ typename vec3<T>::Type 
bodyBodyInteraction(typename vec3<T>::Type ai, 
                    typename vec4<T>::Type bi, 
                    typename vec4<T>::Type bj) 
{
  typename vec3<T>::Type r;

  // r_ij  [3 FLOPS]
  r.x = bj.x - bi.x;
  r.y = bj.y - bi.y;
  r.z = bj.z - bi.z;

  // distSqr = dot(r_ij, r_ij) + EPS^2  [6 FLOPS]
  T distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
  distSqr += getSofteningSquared<T>();

  // invDistCube =1/distSqr^(3/2)  [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
  T invDist = rsqrt_T(distSqr);
  T invDistCube =  invDist * invDist * invDist;

  // s = m_j * invDistCube [1 FLOP]
  T s = bj.w * invDistCube;

  // a_i =  a_i + s * r_ij [6 FLOPS]
  ai.x += r.x * s;
  ai.y += r.y * s;
  ai.z += r.z * s;

  return ai;
}
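
// In equation form, with G folded into the masses and m_i divided out (the
// caller uses the result directly as an acceleration):
//
//   a_i += m_j * r_ij / (|r_ij|^2 + eps^2)^(3/2)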
template <typename T>
__device__ typename vec3<T>::Type 
bodyBodySpecialInteraction(typename vec3<T>::Type ai, 
			   typename vec4<T>::Type bi, 
			   typename vec4<T>::Type bj, 
			   T softsq, 
			   T& special) 
{
  typename vec3<T>::Type r;

  // r_ij  [3 FLOPS]
  r.x = bj.x - bi.x;
  r.y = bj.y - bi.y;
  r.z = bj.z - bi.z;

  // distSqr = dot(r_ij, r_ij) + EPS^2  [6 FLOPS]
  T distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
  distSqr += softsq;
  special = distSqr;

  // invDistCube =1/distSqr^(3/2)  [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
  T invDist = rsqrt_T(distSqr);
  T invDistCube =  invDist * invDist * invDist;

  // s = m_j * invDistCube [1 FLOP]
  T s = bj.w * invDistCube;

  // a_i =  a_i + s * r_ij [6 FLOPS]
  ai.x += r.x * s;
  ai.y += r.y * s;
  ai.z += r.z * s;

  return ai;
}
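
// Relative to bodyBodyInteraction, the "special" variant takes its softening
// term as an explicit argument instead of the __constant__ value, and returns
// the softened |r_ij|^2 through `special` so that integrateBodies can scale
// the force it records for the special body.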

// This is the "tile_calculation" function from the GPUG3 article.
template <typename T>
__device__ typename vec3<T>::Type 
gravitation(typename vec4<T>::Type iPos, 
            typename vec3<T>::Type accel)
{
  typename vec4<T>::Type* sharedPos = SharedMemory<typename vec4<T>::Type>();

  // The CUDA 1.1 compiler cannot determine that i is not going to 
  // overflow in the loop below.  Therefore if int is used on 64-bit linux 
  // or windows (or long instead of long long on win64), the compiler
  // generates suboptimal code.  Therefore we use long long on win64 and
  // long on everything else. (Workaround for Bug ID 347697)
#ifdef _WIN64
  unsigned long long j = 0;
#else
  unsigned long j = 0;
#endif

  // Here we unroll the loop to reduce bookkeeping instruction overhead
  // 32x unrolling seems to provide best performance

  // Note that having an unsigned int loop counter and an unsigned
  // long index helps the compiler generate efficient code on 64-bit
  // OSes.  The compiler can't assume the 64-bit index won't overflow
  // so it incurs extra integer operations.  This is a standard issue
  // in porting 32-bit code to 64-bit OSes.

#pragma unroll 32
  for (unsigned int counter = 0; counter < blockDim.x; counter++ ) 
    {
      accel = bodyBodyInteraction<T>(accel, iPos, SX(j++)); 
    }

  return accel;
}

// WRAP is used to force each block to start working on a different 
// chunk (and wrap around back to the beginning of the array) so that
// not all multiprocessors try to read the same memory locations at 
// once.
#define WRAP(x,m) (((x)<m)?(x):(x-m))  // Mod without divide, works on values from 0 up to 2m
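// e.g. with m == 4: WRAP(2,4) == 2 and WRAP(5,4) == 1, but WRAP(9,4) == 5,
// which is why the 0 <= x < 2m precondition matters.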

template <typename T, bool multithreadBodies>
__device__ typename vec3<T>::Type
computeBodyAccel(typename vec4<T>::Type bodyPos, 
                 typename vec4<T>::Type* positions, 
                 int numBodies)
{
  typename vec4<T>::Type* sharedPos = SharedMemory<typename vec4<T>::Type>();

  typename vec3<T>::Type acc = {0.0f, 0.0f, 0.0f};

  int p = blockDim.x;
  int q = blockDim.y;
  int n = numBodies;
  int numTiles = n / (p * q);

  for (int tile = blockIdx.y; tile < numTiles + blockIdx.y; tile++) 
    {
      sharedPos[threadIdx.x+blockDim.x*threadIdx.y] = 
	multithreadBodies ? 
	positions[WRAP(blockIdx.x + q * tile + threadIdx.y, gridDim.x) * p + threadIdx.x] :
        positions[WRAP(blockIdx.x + tile,                   gridDim.x) * p + threadIdx.x];

      __syncthreads();

      // This is the "tile_calculation" function from the GPUG3 article.
      acc = gravitation<T>(bodyPos, acc);

      __syncthreads();
    }

  // When the numBodies / thread block size is < # multiprocessors (16 on G80), the GPU is 
  // underutilized.  For example, with a 256 threads per block and 1024 bodies, there will only 
  // be 4 thread blocks, so the GPU will only be 25% utilized. To improve this, we use multiple 
  // threads per body.  We still can use blocks of 256 threads, but they are arranged in q rows 
  // of p threads each.  Each thread processes 1/q of the forces that affect each body, and then 
  // 1/q of the threads (those with threadIdx.y==0) add up the partial sums from the other 
  // threads for that body.  To enable this, use the "--p=" and "--q=" command line options to 
  // this example. e.g.: "nbody.exe --n=1024 --p=64 --q=4" will use 4 threads per body and 256 
  // threads per block. There will be n/p = 16 blocks, so a G80 GPU will be 100% utilized.

  // We use a bool template parameter to specify when the number of threads per body is greater 
  // than one, so that when it is not we don't have to execute the more complex code required!
  if (multithreadBodies)
    {
      SX_SUM(threadIdx.x, threadIdx.y).x = acc.x;
      SX_SUM(threadIdx.x, threadIdx.y).y = acc.y;
      SX_SUM(threadIdx.x, threadIdx.y).z = acc.z;

      __syncthreads();

	  // Sum the partial results computed by the other threads for this body;
	  // the caller stores the final value to global memory
      if (threadIdx.y == 0) 
        {
	  for (int i = 1; i < blockDim.y; i++) 
            {
	      acc.x += SX_SUM(threadIdx.x,i).x;
	      acc.y += SX_SUM(threadIdx.x,i).y;
	      acc.z += SX_SUM(threadIdx.x,i).z;
            }
        }
    }

  return acc;
}

template<typename T, bool multithreadBodies>
__global__ void
integrateBodies(typename vec4<T>::Type* newPos, 
                typename vec4<T>::Type* oldPos, 
                typename vec4<T>::Type* vel,
                unsigned int deviceOffset, unsigned int deviceNumBodies,
                float deltaTime, float damping, int totalNumBodies, 
		typename vec4<T>::Type h_pos, 
		T p_softsq, 
		T h_softsq,
		float4* h_force_array)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= deviceNumBodies)
    return;

  typename vec4<T>::Type position = oldPos[deviceOffset + index];  

  typename vec3<T>::Type accel = computeBodyAccel<T, multithreadBodies>(position, oldPos, totalNumBodies);

  typename vec3<T>::Type h_accel = {0.0f, 0.0f, 0.0f};
  T distSqrH; // softened squared distance to the special body
  h_accel = bodyBodySpecialInteraction<T>(h_accel, position, h_pos, p_softsq, distSqrH);

  // Record this body's acceleration due to the special body, attenuated by
  // r^2 / (r^2 + h_softsq) so the logged force rolls off smoothly up close.
  h_force_array[index].x = h_accel.x * distSqrH / (distSqrH + h_softsq);
  h_force_array[index].y = h_accel.y * distSqrH / (distSqrH + h_softsq);
  h_force_array[index].z = h_accel.z * distSqrH / (distSqrH + h_softsq);

  accel.x += h_accel.x;
  accel.y += h_accel.y;
  accel.z += h_accel.z;


  if (!multithreadBodies || (threadIdx.y == 0))
    {
      // acceleration = force / mass; 
      // new velocity = old velocity + acceleration * deltaTime
      // note we factor out the body's mass from the equation, here and in bodyBodyInteraction 
      // (because they cancel out).  Thus here force == acceleration
      typename vec4<T>::Type velocity = vel[deviceOffset + index];

      velocity.x += accel.x * deltaTime;
      velocity.y += accel.y * deltaTime;
      velocity.z += accel.z * deltaTime;  

      velocity.x *= damping;
      velocity.y *= damping;
      velocity.z *= damping;

      // new position = old position + velocity * deltaTime
      position.x += velocity.x * deltaTime;
      position.y += velocity.y * deltaTime;
      position.z += velocity.z * deltaTime;
	
      // Bounce off the walls of the bounding box, losing 10% of the speed.
      // fabs (not int abs) and T(0.9) keep the arithmetic in the kernel's
      // native precision.
      velocity.x = (fabs(position.x) > max_boundary) ? -velocity.x * T(0.9) : velocity.x;
      velocity.y = (fabs(position.y) > max_boundary) ? -velocity.y * T(0.9) : velocity.y;
      velocity.z = (fabs(position.z) > max_boundary) ? -velocity.z * T(0.9) : velocity.z;
      position.x = clamp(position.x, -max_boundary, max_boundary);
      position.y = clamp(position.y, -max_boundary, max_boundary);
      position.z = clamp(position.z, -max_boundary, max_boundary);

      // store new position and velocity
      newPos[deviceOffset + index] = position;
      vel[deviceOffset + index]    = velocity;
    }
}
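
// integrateBodies must be launched with blockDim.x * blockDim.y *
// sizeof(vec4<T>::Type) bytes of dynamic shared memory; see the
// sharedMemSize computation in integrateNbodySystem below.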
// Comparator on force magnitude, used by the commented-out diagnostic in
// integrateNbodySystem to find the body feeling the largest force.
struct LengthLess{
  __host__ __device__ bool operator()(float4 a, float4 b){
    return length(a) < length(b);
  }
};

template <typename T>
void integrateNbodySystem(/*input*/
			  DeviceData<T> *deviceData,
                          cudaGraphicsResource **pgres, 
                          unsigned int currentRead,
                          float deltaTime, 
                          float damping, 
                          unsigned int numBodies, 
                          unsigned int numDevices,
                          int p, 
                          int q,
                          bool bUsePBO, 
			  float h_x, float h_y, float h_z, float h_mass,
			  /*output*/
			  float4 &result)
{
  if (bUsePBO)
    {
      cutilSafeCall(cudaGraphicsResourceSetMapFlags(pgres[currentRead], cudaGraphicsMapFlagsReadOnly));     
      cutilSafeCall(cudaGraphicsResourceSetMapFlags(pgres[1-currentRead], cudaGraphicsMapFlagsWriteDiscard));     
      cutilSafeCall(cudaGraphicsMapResources(2, pgres, 0));
      size_t bytes;
      cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&(deviceData[0].dPos[currentRead]), &bytes, pgres[currentRead]));
      cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&(deviceData[0].dPos[1-currentRead]), &bytes, pgres[1-currentRead]));
    }


  cudaDeviceProp props;

  for (unsigned int dev = 0; dev != numDevices; dev++)
    {
      if (numDevices > 1)
	cudaSetDevice(dev);

      cutilSafeCall(cudaGetDeviceProperties(&props, dev));

      while ((deviceData[dev].numBodies > 0) && 
	     (deviceData[dev].numBodies / p < (unsigned)props.multiProcessorCount))
        {
	  p /= 2;
	  q *= 2;
        }
        
      // Re-derive the block shape after the p/q adjustment above, so the
      // launch configuration matches the grid computed from the new p.
      dim3 threads(p, q, 1);
      dim3 grid((deviceData[dev].numBodies + (p-1))/p, 1, 1);

      // execute the kernel:

      // When numBodies / blockDim is smaller than the number of
      // multiprocessors, split each body's work across q threads; see the
      // detailed discussion above computeBodyAccel ("--p="/"--q=" options).
      int sharedMemSize = p * q * 4 * sizeof(T); // one 4-component position per thread

      typename vec4<T>::Type h_pos;
      h_pos.x=h_x;
      h_pos.y=h_y;
      h_pos.z=h_z;
      h_pos.w=h_mass;
      

      if (grid.x > 0 && threads.y == 1)
        {
	  integrateBodies<T, false><<< grid, threads, sharedMemSize >>>
	    ((typename vec4<T>::Type*)deviceData[dev].dPos[1-currentRead], 
	     (typename vec4<T>::Type*)deviceData[dev].dPos[currentRead], 
	     (typename vec4<T>::Type*)deviceData[dev].dVel,
	     deviceData[dev].offset, deviceData[dev].numBodies,
	     deltaTime, damping, numBodies, 
	     h_pos, 
	     h_mass<0?0.1f:70.f,
	     h_mass<0?0.1f:256.f,
	     force_array);
        }
      else if (grid.x > 0)
        {
	  integrateBodies<T, true><<< grid, threads, sharedMemSize >>>
	    ((typename vec4<T>::Type*)deviceData[dev].dPos[1-currentRead], 
	     (typename vec4<T>::Type*)deviceData[dev].dPos[currentRead], 
	     (typename vec4<T>::Type*)deviceData[dev].dVel,
	     deviceData[dev].offset, deviceData[dev].numBodies,
	     deltaTime, damping, numBodies, 
	     h_pos, 
	     h_mass<0?0.3f:200.f,
	     h_mass<0?10.f:300.f,
	     force_array);
        }
        
      if (numDevices > 1)
        {
	  cutilSafeCall(cudaEventRecord(deviceData[dev].event));
	  // MJH: Hack on older driver versions to force kernel launches to flush!
	  cudaStreamQuery(0);
        }


      // Sum the per-body forces into the net force on the special body
      // (float4 operator+ comes from cutil_math).
      result = thrust::reduce(thrust_force4.begin(), thrust_force4.end(), make_float4(0.0f), thrust::plus<float4>());
      //thrust::device_vector<float4>::iterator max_force = thrust::max_element(thrust_force4.begin(), thrust_force4.end(), LengthLess());
      //printf("Total %f\tmax %f\n", length(result), length(*max_force));

      // check if kernel invocation generated an error
      cutilCheckMsg("Kernel execution failed");
    }

  if (numDevices > 1)
    {
      for (unsigned int dev = 0; dev < numDevices; dev++)
	cutilSafeCall(cudaEventSynchronize(deviceData[dev].event));
    }

  if (bUsePBO)
    {
      cutilSafeCall(cudaGraphicsUnmapResources(2, pgres, 0));
    }
}
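
// Host-side usage sketch (hypothetical driver loop; everything except
// allocateForceVector and integrateNbodySystem is illustrative):
//
//   allocateForceVector(numBodies);
//   float4 netForce;
//   for (int step = 0; step < numSteps; step++) {
//     integrateNbodySystem<float>(deviceData, pgres, currentRead,
//                                 dt, damping, numBodies, numDevices,
//                                 p, q, bUsePBO, hX, hY, hZ, hMass,
//                                 /*output*/ netForce);
//     currentRead = 1 - currentRead;   // ping-pong the position buffers
//   }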


// Explicit instantiations needed to generate code
template void integrateNbodySystem<float>(DeviceData<float>* deviceData,
                                          cudaGraphicsResource** pgres,
                                          unsigned int currentRead,
                                          float deltaTime, 
                                          float damping, 
                                          unsigned int numBodies, 
                                          unsigned int numDevices,
                                          int p, int q,
                                          bool bUsePBO, 
					  float h_x, float h_y, float h_z, float h_mass, 
					  float4 &result);

template void integrateNbodySystem<double>(DeviceData<double>* deviceData,
                                           cudaGraphicsResource** pgres,
                                           unsigned int currentRead,
                                           float deltaTime, 
                                           float damping, 
                                           unsigned int numBodies, 
                                           unsigned int numDevices,
                                           int p, int q,
                                           bool bUsePBO,
					   float h_x, float h_y, float h_z, float h_mass,
					   float4 &result);
